Import packages and data¶
%%capture
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
try:
from prettytable import PrettyTable
from pypfopt import HRPOpt
except:
!pip install -q - U prettytable
!pip install -q -U PyPortfolioOpt
from pypfopt import HRPOpt
from prettytable import PrettyTable
from google.colab import drive
from IPython.display import display_html, HTML, display
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.tree import export_graphviz
from IPython.display import Image
from scipy import stats
from statistics import multimode
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
from pypfopt import plotting
# get all data from GitHub repo
files = ['er','ed','futures','infswp','sofr','yields','ez_cds']
file_dict = {}
for file in files:
file_dict[file] = pd.read_csv('https://raw.githubusercontent.com/sjv1030/wq-capstone/main/data/'+file+'.csv')
file_dict[file]['Date'] = pd.to_datetime(file_dict[file]['Date'], format='%m/%d/%Y')
file_dict[file].set_index('Date', inplace=True)
file_dict[file] = file_dict[file].loc[file_dict[file].index.dropna()]
file_dict[file] = file_dict[file].loc[:'2024'].copy()
file_dict[file].columns = file_dict[file].columns.str.strip()
#print(file_dict[file].info())
#display(file_dict[file].head())
Data transformation¶
# Initial training period (year)
beg_yr = 2010
end_yr = 2012
Convert STIR into yield term¶
Cycle through all contracts and get difference between Eurodollar futures and SOFR futures. For each contract, identify the date where the absolute delta between the Eurodollar and SOFR contract prices is minimized. This date will be used to merge the two contracts in the next step.
merge_ed_sfr = dict()
for k in range(1,13):
_ = pd.concat([file_dict['ed'][['ED'+str(k)]], file_dict['sofr'][['SFR'+str(k)]]], axis=1)
_['delta'+str(k)] = np.abs(_['ED'+str(k)] - _['SFR'+str(k)])
_['delta'+str(k)].loc['2023':'2023'].dropna().plot(legend=True)
dates = (_['delta'+str(k)].loc['2023':'2023'].index[_['delta'+str(k)].loc['2023':'2023'].dropna().argmin()],
_['delta'+str(k)].loc['2023':'2023'].index[_['delta'+str(k)].loc['2023':'2023'].dropna().argmin()+1])
merge_ed_sfr[k] = dates
plt.show()
# merge ED and SFR futures
backdated_SFR = pd.DataFrame()
for k in range(1,13):
merge_date, merge_date_1 = merge_ed_sfr[k]
tmp = pd.concat([file_dict['ed']['ED'+str(k)].loc[:merge_date],
file_dict['sofr']['SFR'+str(k)].loc[merge_date_1:]], axis=0
)
backdated_SFR['SFR'+str(k)] = tmp
Calculate calendar spreads for the STIR futures. The result is in yield terms.
Short-term interest rate (STIR) futures are quoted in price terms. Given standard contract conventions, 100 less the price equates to a yield in percent form. Arguably, for macro/fixed-income investment ideas, the yield is more important.
Note that a term spread is usually calculated with the longer duration security first.
For example, the US Yield Curve can be defind as 10-year US Treasury less the 2-year US Treasury But with STIR futures, one can reverse the order to get the term spread in yield terms.
Example - Using prices from Jan 9, 2023:
- SFR4's price was 95.405 (or 4.595%)
- SFR8's price was 96.790 (or 3.21%)
One can calculate the term spread as SFR4 - SFR8, or -1.385% Note: the term spread is negative because the curve is inverted
# calculate STIR calendar spreads
tmp_dict = {}
for i in range(1,12):
for j in range(2,13):
if i < j:
tmp_dict['ER'+str(i)+'-'+str(j)] = file_dict['er']['ER'+str(i)] - file_dict['er']['ER'+str(j)]
tmp_dict['SFR'+str(i)+'-'+str(j)] = backdated_SFR['SFR'+str(i)] - backdated_SFR['SFR'+str(j)]
# save data to a dataframe
full_data = pd.DataFrame.from_dict(tmp_dict)
# calculate cross country spreads at key tenors and add to main dataframe
tmp_dict = {}
for i in range(4,13,2):
tmp_dict['SFRER'+str(i)] = backdated_SFR['SFR'+str(i)] - file_dict['er']['ER'+str(i)]
full_data = full_data.join(pd.DataFrame.from_dict(tmp_dict), how='left')
Create spreads¶
Below is a large dictionary holding all of the various spreads to be used as either a target or within the feature set.
file_dict['spread'] = pd.DataFrame()
file_dict['spread']['US302'] = file_dict['yields']['US30'] - file_dict['yields']['US2']
file_dict['spread']['US305'] = file_dict['yields']['US30'] - file_dict['yields']['US5']
file_dict['spread']['US3010'] = file_dict['yields']['US30'] - file_dict['yields']['US10']
file_dict['spread']['US102'] = file_dict['yields']['US10'] - file_dict['yields']['US2']
file_dict['spread']['US105'] = file_dict['yields']['US10'] - file_dict['yields']['US5']
file_dict['spread']['US52'] = file_dict['yields']['US5'] - file_dict['yields']['US2']
file_dict['spread']['DE302'] = file_dict['yields']['DE30'] - file_dict['yields']['DE2']
file_dict['spread']['DE305'] = file_dict['yields']['DE30'] - file_dict['yields']['DE5']
file_dict['spread']['DE3010'] = file_dict['yields']['DE30'] - file_dict['yields']['DE10']
file_dict['spread']['DE102'] = file_dict['yields']['DE10'] - file_dict['yields']['DE2']
file_dict['spread']['DE105'] = file_dict['yields']['DE10'] - file_dict['yields']['DE5']
file_dict['spread']['DE52'] = file_dict['yields']['DE5'] - file_dict['yields']['DE2']
file_dict['spread']['CA102'] = file_dict['yields']['CA10'] - file_dict['yields']['CA2']
file_dict['spread']['CA105'] = file_dict['yields']['CA10'] - file_dict['yields']['CA5']
file_dict['spread']['CA52'] = file_dict['yields']['CA5'] - file_dict['yields']['CA2']
file_dict['spread']['ITDE10'] = file_dict['yields']['IT10'] - file_dict['yields']['DE10']
file_dict['spread']['USDE2'] = file_dict['yields']['US2'] - file_dict['yields']['DE2']
file_dict['spread']['USDE5'] = file_dict['yields']['US5'] - file_dict['yields']['DE5']
file_dict['spread']['USDE10'] = file_dict['yields']['US10'] - file_dict['yields']['DE10']
file_dict['spread']['USEZINF2'] = file_dict['infswp']['US-2'] - file_dict['infswp']['EZ-2']
file_dict['spread']['USEZINF5'] = file_dict['infswp']['US-5'] - file_dict['infswp']['EZ-5']
file_dict['spread']['USEZINF10'] = file_dict['infswp']['US-10'] - file_dict['infswp']['EZ-10']
file_dict['spread']['USINF102'] = file_dict['infswp']['US-10'] - file_dict['infswp']['US-2']
file_dict['spread']['USINF105'] = file_dict['infswp']['US-10'] - file_dict['infswp']['US-5']
file_dict['spread']['USINF52'] = file_dict['infswp']['US-5'] - file_dict['infswp']['US-2']
file_dict['spread']['EZINF102'] = file_dict['infswp']['EZ-10'] - file_dict['infswp']['EZ-2']
file_dict['spread']['EZINF105'] = file_dict['infswp']['EZ-10'] - file_dict['infswp']['EZ-5']
file_dict['spread']['EZINF52'] = file_dict['infswp']['EZ-5'] - file_dict['infswp']['EZ-2']
# STIR futures strictly follow US trading days, so the other dataframe holding spreads (calculated above) will be merged with this one
full_data = full_data.join(file_dict['spread'], how='left')
full_data = full_data.join(file_dict['yields'], how='left')
full_data = full_data.join(file_dict['infswp'], how='left')
full_data.head()
| ER1-2 | SFR1-2 | ER1-3 | SFR1-3 | ER1-4 | SFR1-4 | ER1-5 | SFR1-5 | ER1-6 | SFR1-6 | ... | US10 | US2 | US5 | US30 | EZ-2 | EZ-5 | EZ-10 | US-2 | US-5 | US-10 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2010-01-04 | 0.360 | 0.280 | 0.73 | 0.660 | 1.065 | 1.075 | 1.330 | 1.475 | 1.570 | 1.865 | ... | 3.8155 | 1.0638 | 2.6350 | 4.6431 | 1.6800 | 2.1600 | 2.4410 | 1.5398 | 2.3475 | 2.7818 |
| 2010-01-05 | 0.335 | 0.245 | 0.70 | 0.610 | 1.025 | 1.005 | 1.295 | 1.385 | 1.540 | 1.760 | ... | 3.7608 | 1.0080 | 2.5611 | 4.6091 | 1.7025 | 2.1725 | 2.4575 | 1.5556 | 2.3500 | 2.7937 |
| 2010-01-06 | 0.330 | 0.225 | 0.69 | 0.585 | 1.015 | 0.990 | 1.290 | 1.385 | 1.545 | 1.770 | ... | 3.8215 | 0.9920 | 2.5913 | 4.6875 | 1.7300 | 2.1950 | 2.4880 | 1.5811 | 2.4050 | 2.8445 |
| 2010-01-07 | 0.310 | 0.225 | 0.66 | 0.590 | 0.985 | 1.005 | 1.260 | 1.415 | 1.525 | 1.810 | ... | 3.8235 | 1.0240 | 2.6115 | 4.6855 | 1.7650 | 2.2200 | 2.4880 | 1.6155 | 2.3545 | 2.8551 |
| 2010-01-08 | 0.300 | 0.190 | 0.64 | 0.520 | 0.955 | 0.925 | 1.230 | 1.330 | 1.500 | 1.735 | ... | 3.8297 | 0.9759 | 2.5912 | 4.7150 | 1.7600 | 2.2050 | 2.4670 | 1.6061 | 2.3920 | 2.8490 |
5 rows × 184 columns
PCA for EZ CDS¶
# this function will be leveraged in the work-forward analysis
# to test the Italian-German 10-year spread
def make_pca(df, beg, end):
'''
Function creates a risk index from PCA loadings.
'''
cds = df.ffill().iloc[beg:end].diff().dropna().values
scaler = StandardScaler()
scaled_cds = scaler.fit_transform(cds)
pca = PCA(n_components=3)
pc = pca.fit_transform(scaled_cds)
# multiply PCA loadings with scaled underlying data
scaled_cds_array = np.array(scaled_cds)
scaled_loadings = np.dot(scaled_cds_array, pca.components_.T)
# multiply "scaled loadings" with principal component weights, respectively
# to create one index that combines all three principal components
# this index had to be negated to make it comparable with underlying CDS data
EZ_risk_index = -np.dot(scaled_loadings, pca.explained_variance_)
EZ_risk = pd.DataFrame(EZ_risk_index.cumsum(), index=file_dict['ez_cds'].loc[str(beg):str(end)].index[1:])
EZ_risk.columns = ['EZ Risk Index']
return EZ_risk
Use PCA to identify the top principal components (PCs). Plot a chart of explained variance and a biplot showing how the original data maps onto the first 2 PCs.
cds = file_dict['ez_cds'].ffill().loc[str(beg_yr):str(end_yr)].diff().dropna().values
scaler = StandardScaler()
scaled_cds = scaler.fit_transform(cds)
pca = PCA(n_components=3)
pc = pca.fit_transform(scaled_cds)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())
plt.bar([1,2,3], pca.explained_variance_ratio_.cumsum())
plt.plot([1,2,3], pca.explained_variance_ratio_, color='red', marker='o')
plt.legend(['Individual Variance', 'Cumulative Variance'])
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance % (1=100%)')
plt.ylim([0,1])
plt.xticks(np.arange(1, 3.1, step=1))
plt.show()
[0.81588054 0.05860003 0.04874604] [0.81588054 0.87448057 0.92322662]
# visualize the loadings
loadings = pd.DataFrame(pca.components_.T, columns=['PC1', 'PC2', 'PC3'], index=file_dict['ez_cds'].columns)
display(loadings)
def loading_plot(coeff, labels):
n = coeff.shape[0]
for i in range(n):
plt.arrow(0, 0, coeff[i,0], coeff[i,1], head_width = 0.05, head_length = 0.05, color = '#21918C',alpha = 0.5)
plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = '#21918C', ha = 'center', va = 'center')
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.grid()
fig, ax = plt.subplots(figsize = (7,4))
loading_plot(pca.components_.T, file_dict['ez_cds'].columns)
plt.show()
| PC1 | PC2 | PC3 | |
|---|---|---|---|
| CS | -0.341493 | 0.319370 | 0.515410 |
| UBS | -0.339588 | 0.367801 | 0.469703 |
| Unicredit | -0.362522 | -0.041834 | -0.334119 |
| BancoSantander | -0.353965 | -0.544182 | 0.199138 |
| BNP | -0.360829 | 0.245916 | -0.315533 |
| Intesa | -0.360868 | -0.045345 | -0.394348 |
| SocGen | -0.356120 | 0.284210 | -0.267641 |
| BBVA | -0.352283 | -0.567040 | 0.189044 |
# multiply PCA loadings with scaled underlying data
scaled_cds_array = np.array(scaled_cds)
scaled_loadings = np.dot(scaled_cds_array, pca.components_.T)
# multiply "scaled loadings" with principal component weights, respectively
# to create one index that combines all three principal components
# this index had to be negated to make it comparable with underlying CDS data
EZ_risk_index = -np.dot(scaled_loadings, pca.explained_variance_)
The risk index created using the PCA loadings and scaled CDS data aligns well with the underlying CDS data, suggesting it is capturing a large amount of the common variation.
file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)].plot(legend=True)
plt.title('5-Year CDS of Eurozone Banks')
plt.xlabel('')
plt.ylabel('Basis Points')
plt.show()
plt.plot(file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)].index[1:], EZ_risk_index.cumsum(),
label='EZ Risk Index', color='k')
plt.title('5-Year CDS of Eurozone Banks vs. EZ Risk Index')
plt.ylabel('Basis Points')
plt.legend()
plt.plot(file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)])
plt.show()
# create PCA index and add to larger datafile
EZ_risk = pd.DataFrame(EZ_risk_index.cumsum(), index=file_dict['ez_cds'].loc[str(beg_yr):str(end_yr)].index[1:])
EZ_risk.columns = ['EZ Risk Index']
full_data = full_data.join(EZ_risk, how='left')
full_data.head()
| ER1-2 | SFR1-2 | ER1-3 | SFR1-3 | ER1-4 | SFR1-4 | ER1-5 | SFR1-5 | ER1-6 | SFR1-6 | ... | US2 | US5 | US30 | EZ-2 | EZ-5 | EZ-10 | US-2 | US-5 | US-10 | EZ Risk Index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2010-01-04 | 0.360 | 0.280 | 0.73 | 0.660 | 1.065 | 1.075 | 1.330 | 1.475 | 1.570 | 1.865 | ... | 1.0638 | 2.6350 | 4.6431 | 1.6800 | 2.1600 | 2.4410 | 1.5398 | 2.3475 | 2.7818 | -4.918199 |
| 2010-01-05 | 0.335 | 0.245 | 0.70 | 0.610 | 1.025 | 1.005 | 1.295 | 1.385 | 1.540 | 1.760 | ... | 1.0080 | 2.5611 | 4.6091 | 1.7025 | 2.1725 | 2.4575 | 1.5556 | 2.3500 | 2.7937 | -10.947791 |
| 2010-01-06 | 0.330 | 0.225 | 0.69 | 0.585 | 1.015 | 0.990 | 1.290 | 1.385 | 1.545 | 1.770 | ... | 0.9920 | 2.5913 | 4.6875 | 1.7300 | 2.1950 | 2.4880 | 1.5811 | 2.4050 | 2.8445 | -15.339995 |
| 2010-01-07 | 0.310 | 0.225 | 0.66 | 0.590 | 0.985 | 1.005 | 1.260 | 1.415 | 1.525 | 1.810 | ... | 1.0240 | 2.6115 | 4.6855 | 1.7650 | 2.2200 | 2.4880 | 1.6155 | 2.3545 | 2.8551 | -20.284589 |
| 2010-01-08 | 0.300 | 0.190 | 0.64 | 0.520 | 0.955 | 0.925 | 1.230 | 1.330 | 1.500 | 1.735 | ... | 0.9759 | 2.5912 | 4.7150 | 1.7600 | 2.2050 | 2.4670 | 1.6061 | 2.3920 | 2.8490 | -23.648763 |
5 rows × 185 columns
EDA¶
# convert dataset to weekly at Friday observations
full_data_wf = full_data.resample('W-FRI').last()
full_data_wf.head()
| ER1-2 | SFR1-2 | ER1-3 | SFR1-3 | ER1-4 | SFR1-4 | ER1-5 | SFR1-5 | ER1-6 | SFR1-6 | ... | US2 | US5 | US30 | EZ-2 | EZ-5 | EZ-10 | US-2 | US-5 | US-10 | EZ Risk Index | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2010-01-08 | 0.300 | 0.190 | 0.640 | 0.520 | 0.955 | 0.925 | 1.230 | 1.330 | 1.500 | 1.735 | ... | 0.9759 | 2.5912 | 4.7150 | 1.7600 | 2.2050 | 2.4670 | 1.6061 | 2.3920 | 2.8490 | -23.648763 |
| 2010-01-15 | 0.255 | 0.135 | 0.555 | 0.425 | 0.835 | 0.800 | 1.080 | 1.185 | 1.335 | 1.560 | ... | 0.8621 | 2.4159 | 4.5814 | 1.7000 | 2.1030 | 2.3680 | 1.5670 | 2.2750 | 2.7300 | -6.161920 |
| 2010-01-22 | 0.225 | 0.110 | 0.500 | 0.370 | 0.780 | 0.730 | 1.035 | 1.100 | 1.285 | 1.470 | ... | 0.7875 | 2.3414 | 4.5312 | 1.6995 | 2.0470 | 2.3110 | 1.5878 | 2.2090 | 2.6706 | 25.244170 |
| 2010-01-29 | 0.265 | 0.115 | 0.550 | 0.355 | 0.820 | 0.700 | 1.060 | 1.080 | 1.290 | 1.460 | ... | 0.8118 | 2.3232 | 4.4884 | 1.5100 | 1.8950 | 2.2982 | 1.5722 | 2.2440 | 2.7591 | 33.599325 |
| 2010-02-05 | 0.235 | 0.105 | 0.445 | 0.305 | 0.660 | 0.610 | 0.860 | 0.950 | 1.070 | 1.300 | ... | 0.7633 | 2.2333 | 4.5186 | 1.4375 | 1.8125 | 2.1925 | 1.4932 | 2.1355 | 2.6228 | 57.463195 |
5 rows × 185 columns
Correlations¶
Intuitively, the various permutations of spreads calculated for the STIR futures above will have a high level of correlation amongst themselves.
As a result, the correlation heatmaps below focused on the yield spreads.
spreads_cols = file_dict['spread'].columns
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(full_data_wf[spreads_cols].loc[str(beg_yr):str(end_yr)].corr(method='pearson').round(2))
plt.title('Correlation on levels ('+str(beg_yr)+'-'+str(end_yr)+')')
plt.show()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(full_data_wf[spreads_cols].loc[str(beg_yr):str(end_yr)].diff().corr(method='pearson').round(2))
plt.title('Correlation on first difference ('+str(beg_yr)+'-'+str(end_yr)+')')
plt.show()
Feature Selection via Correlations¶
The absolute value of pearson correlation is used to select the top securities whose values are above 0.7 (with a maximum of 10 features per target variable). This process will occur every time the model is updated during the walk-forward backtest.
target_list = []
for col in full_data_wf.columns:
if col in file_dict['ed'].columns:
continue
elif col in file_dict['sofr'].columns:
continue
elif col in file_dict['yields'].columns:
continue
elif col in file_dict['ez_cds'].columns:
continue
elif col in file_dict['infswp'].columns:
continue
elif col == 'EZ Risk Index':
continue
else:
target_list.append(col)
#target_list = ['US102','DE52','USDE10','SFR4-8','SFR6-12','ER4-8','ER6-12','USDE5',
# 'SFR2-8','ER2-8','ITDE10']
#target_list = spreads_cols.copy()
feature_dict = dict()
for tgt in target_list:
_corr = full_data_wf.loc[str(beg_yr):str(end_yr)].diff().corr(method='pearson')[tgt]
feature_dict[tgt] = _corr.abs().sort_values(ascending=False).where(lambda x: x > 0.7).dropna().index.tolist()[1:11]
The table below shows the target as column headings and the features (in the rows) that have the highest correlation (either positive or negative).
#feat_df = pd.DataFrame.from_dict(feature_list)
feat_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in feature_dict.items() ]))
display(HTML(feat_df[spreads_cols].to_html()))
| US302 | US305 | US3010 | US102 | US105 | US52 | DE302 | DE305 | DE3010 | DE102 | DE105 | DE52 | CA102 | CA105 | CA52 | ITDE10 | USDE2 | USDE5 | USDE10 | USEZINF2 | USEZINF5 | USEZINF10 | USINF102 | USINF105 | USINF52 | EZINF102 | EZINF105 | EZINF52 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | US30 | US3010 | US305 | US10 | US305 | US5 | DE102 | DE302 | DE305 | DE302 | DE305 | DE102 | CA52 | NaN | CA102 | IT10 | SFRER4 | SFRER8 | USDE5 | US-2 | NaN | NaN | USINF52 | NaN | USINF102 | EZINF52 | NaN | EZINF102 |
| 1 | US105 | US105 | NaN | US30 | US302 | US10 | DE305 | DE105 | NaN | DE52 | DE102 | NaN | NaN | NaN | NaN | NaN | DE2 | SFRER10 | NaN | NaN | NaN | NaN | US-2 | NaN | US-2 | EZ-2 | NaN | NaN |
| 2 | US102 | US302 | NaN | US302 | NaN | SFR2-12 | DE105 | DE3010 | NaN | DE105 | DE302 | NaN | NaN | NaN | NaN | NaN | SFRER6 | USDE10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | US305 | NaN | NaN | US52 | NaN | SFR1-12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | USDE5 | SFRER6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | UK10 | NaN | SFR4-12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | DE5 | USDE2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | NaN | NaN | NaN | SFR8-12 | NaN | SFR3-12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SFRER12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | NaN | NaN | NaN | SFR7-12 | NaN | SFR5-12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 7 | NaN | NaN | NaN | SFR9-12 | NaN | SFR4-11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 8 | NaN | NaN | NaN | SFR6-12 | NaN | US102 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9 | NaN | NaN | NaN | NaN | NaN | SFR5-11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Dendrogram¶
Hierarchical clustering is used to group stocks that have higher correlations. This approach measures the "distance" between values, which is defined as the distance between correlation coefficients. The farther the distance, then the more different (or dissimilar) the two values are from each other.
Given a matrix of correlation coefficients, the distance will be calculated as: $\sqrt{0.5 * ( 1 - \rho_{i,j})}$
The dendrogram below has clustered the stocks based on this distance measure. The x-axis labels are the sorted columns from the correlation matrix.
tgt_corr = full_data_wf[spreads_cols].loc[str(beg_yr):str(end_yr)].diff().corr(method='pearson').round(2).corr()
fig, ax = plt.subplots(figsize=(12, 5))
distance = np.sqrt(0.5 * (1 - tgt_corr))
Z = linkage(squareform(distance), 'complete')
dendrogram(Z, labels=tgt_corr.columns, orientation='top', leaf_rotation=90)
x_lab = ax.get_xticklabels()
plt.show()
Plots of raw data¶
Given the high number of features, only some will be charted for illustration.
# Plot example of elevated term spread during periods of high macro risk
# like the Italian-German 10-year spread during the Eurozone debt crisis in 2011
plt.figure(figsize=(8,3))
full_data_wf['ITDE10'].plot(title='ITDE10')
plt.ylabel('Spread in %')
plt.xlabel('')
plt.show()
for c, col in enumerate(spreads_cols):
if c % 10 == 0:
plt.figure(figsize=(8,3))
full_data_wf[col].plot(title=col)
plt.ylabel('Spread in %')
plt.xlabel('')
plt.show()
#for c, col in enumerate(full_data_wf.drop(spreads_cols, axis=1).columns):
# if c % 15 == 0:
# plt.figure(figsize=(8,3))
# full_data_wf[col].plot(title=col)
# plt.ylabel('Spread in %')
# plt.xlabel('')
# plt.show()
Check for stationarity¶
def adf_test(x):
dftest = adfuller(x.dropna(),autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Numer of Observations Used'])
print(dfoutput.round(3))
def run_eda(df):
for col in df.columns:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12,3))
df[col].plot(title=col, ax=axes[0])
sns.histplot(ax=axes[1], data=df,x=col)
axes[1].set_title('Distribution')
plot_acf(df[col].dropna(), ax=axes[2])
axes[2].set_title('Autocorrelation')
plt.tight_layout()
for col in df.columns:
print('Running ADF on '+col+'\n')
adf_test(df[col])
print('----------------------------------------')
Run ADF on all the target variables. While the target does not need to be stationary (since it will be converted to a categorical variable), the ADF will help inform if lags of the target should be included in the feature dataset.
The results suggest all target variables have serial correlation and are not stationary.
Note that the data was contained to 2010-2015 to avoid look ahead bias.
for tgt in target_list:
run_eda(full_data_wf[[tgt]].loc[str(beg_yr) : str(end_yr)].diff()[1:])
Running ADF on ER1-2 Test Statistic -10.509 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-2 Test Statistic -9.175 p-value 0.000 #Lags Used 2.000 Numer of Observations Used 152.000 dtype: float64 ---------------------------------------- Running ADF on ER1-3 Test Statistic -10.479 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-3 Test Statistic -9.275 p-value 0.000 #Lags Used 2.000 Numer of Observations Used 152.000 dtype: float64 ---------------------------------------- Running ADF on ER1-4 Test Statistic -10.516 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-4 Test Statistic -13.519 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER1-5 Test Statistic -10.293 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-5 Test Statistic -13.078 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER1-6 Test Statistic -10.145 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-6 Test Statistic -12.903 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER1-7 Test Statistic -10.086 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-7 Test Statistic -12.876 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER1-8 Test Statistic -10.18 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on SFR1-8 Test Statistic -12.934 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER1-9 Test Statistic -10.215 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-9 Test Statistic -12.981 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER1-10 Test Statistic -10.322 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-10 Test Statistic -13.041 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER1-11 Test Statistic -10.342 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-11 Test Statistic -13.077 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ----------------------------------------
<ipython-input-24-74079bfd30a4>:8: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). Consider using `matplotlib.pyplot.close()`. fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12,3))
Running ADF on ER1-12 Test Statistic -10.401 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR1-12 Test Statistic -13.199 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER2-3 Test Statistic -10.727 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-3 Test Statistic -13.641 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER2-4 Test Statistic -10.422 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-4 Test Statistic -13.18 p-value 0.00 #Lags Used 0.00 Numer of Observations Used 154.00 dtype: float64 ---------------------------------------- Running ADF on ER2-5 Test Statistic -9.941 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-5 Test Statistic -12.732 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER2-6 Test Statistic -9.753 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-6 Test Statistic -12.683 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER2-7 Test Statistic -12.539 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-7 Test Statistic -12.778 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER2-8 Test Statistic -12.918 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-8 Test Statistic -8.135 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER2-9 Test Statistic -13.345 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-9 Test Statistic -8.091 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER2-10 Test Statistic -13.689 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-10 Test Statistic -8.112 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER2-11 Test Statistic -13.743 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-11 Test Statistic -8.171 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER2-12 Test Statistic -13.789 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR2-12 Test Statistic -5.896 p-value 0.000 #Lags Used 2.000 Numer of Observations Used 152.000 dtype: float64 ---------------------------------------- Running ADF on ER3-4 Test Statistic -10.006 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-4 Test Statistic -12.782 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER3-5 Test Statistic -12.756 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-5 Test Statistic -7.179 p-value 0.000 #Lags Used 3.000 Numer of Observations Used 151.000 dtype: float64 ---------------------------------------- Running ADF on ER3-6 Test Statistic -13.111 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-6 Test Statistic -7.056 p-value 0.000 #Lags Used 3.000 Numer of Observations Used 151.000 dtype: float64 ---------------------------------------- Running ADF on ER3-7 Test Statistic -13.173 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-7 Test Statistic -7.832 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER3-8 Test Statistic -13.487 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-8 Test Statistic -7.81 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on ER3-9 Test Statistic -13.882 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-9 Test Statistic -7.804 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER3-10 Test Statistic -14.084 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-10 Test Statistic -7.851 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER3-11 Test Statistic -13.971 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-11 Test Statistic -7.94 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on ER3-12 Test Statistic -13.848 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR3-12 Test Statistic -5.845 p-value 0.000 #Lags Used 2.000 Numer of Observations Used 152.000 dtype: float64 ---------------------------------------- Running ADF on ER4-5 Test Statistic -13.529 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-5 Test Statistic -7.274 p-value 0.000 #Lags Used 3.000 Numer of Observations Used 151.000 dtype: float64 ---------------------------------------- Running ADF on ER4-6 Test Statistic -13.722 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-6 Test Statistic -5.405 p-value 0.000 #Lags Used 4.000 Numer of Observations Used 150.000 dtype: float64 ---------------------------------------- Running ADF on ER4-7 Test Statistic -13.617 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-7 Test Statistic -7.615 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER4-8 Test Statistic -13.941 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-8 Test Statistic -7.618 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER4-9 Test Statistic -14.273 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-9 Test Statistic -7.641 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER4-10 Test Statistic -14.305 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-10 Test Statistic -7.712 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER4-11 Test Statistic -10.373 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-11 Test Statistic -7.834 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER4-12 Test Statistic -10.381 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR4-12 Test Statistic -7.911 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER5-6 Test Statistic -13.701 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR5-6 Test Statistic -5.116 p-value 0.000 #Lags Used 4.000 Numer of Observations Used 150.000 dtype: float64 ---------------------------------------- Running ADF on ER5-7 Test Statistic -13.662 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR5-7 Test Statistic -7.482 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER5-8 Test Statistic -14.151 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR5-8 Test Statistic -7.554 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER5-9 Test Statistic -14.472 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR5-9 Test Statistic -7.61 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on ER5-10 Test Statistic -10.526 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR5-10 Test Statistic -7.727 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER5-11 Test Statistic -10.547 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR5-11 Test Statistic -7.891 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER5-12 Test Statistic -10.47 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on SFR5-12 Test Statistic -5.919 p-value 0.000 #Lags Used 2.000 Numer of Observations Used 152.000 dtype: float64 ---------------------------------------- Running ADF on ER6-7 Test Statistic -13.49 p-value 0.00 #Lags Used 0.00 Numer of Observations Used 154.00 dtype: float64 ---------------------------------------- Running ADF on SFR6-7 Test Statistic -7.693 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER6-8 Test Statistic -14.554 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFR6-8 Test Statistic -7.722 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER6-9 Test Statistic -10.613 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR6-9 Test Statistic -7.766 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER6-10 Test Statistic -10.612 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR6-10 Test Statistic -7.928 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER6-11 Test Statistic -10.525 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR6-11 Test Statistic -8.131 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER6-12 Test Statistic -10.398 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR6-12 Test Statistic -8.246 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER7-8 Test Statistic -10.44 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on SFR7-8 Test Statistic -7.941 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER7-9 Test Statistic -10.776 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR7-9 Test Statistic -7.994 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER7-10 Test Statistic -10.58 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on SFR7-10 Test Statistic -8.265 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER7-11 Test Statistic -10.504 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR7-11 Test Statistic -8.527 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER7-12 Test Statistic -10.335 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR7-12 Test Statistic -15.165 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER8-9 Test Statistic -11.058 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR8-9 Test Statistic -8.207 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on ER8-10 Test Statistic -10.709 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR8-10 Test Statistic -15.382 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER8-11 Test Statistic -10.59 p-value 0.00 #Lags Used 1.00 Numer of Observations Used 153.00 dtype: float64 ---------------------------------------- Running ADF on SFR8-11 Test Statistic -15.389 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER8-12 Test Statistic -10.395 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR8-12 Test Statistic -15.506 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER9-10 Test Statistic -10.041 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR9-10 Test Statistic -15.419 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER9-11 Test Statistic -10.327 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR9-11 Test Statistic -15.539 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ER9-12 Test Statistic -10.199 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR9-12 Test Statistic -4.744 p-value 0.000 #Lags Used 5.000 Numer of Observations Used 149.000 dtype: float64 ---------------------------------------- Running ADF on ER10-11 Test Statistic -10.194 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR10-11 Test Statistic -4.811 p-value 0.000 #Lags Used 7.000 Numer of Observations Used 147.000 dtype: float64 ---------------------------------------- Running ADF on ER10-12 Test Statistic -10.176 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR10-12 Test Statistic -4.919 p-value 0.000 #Lags Used 7.000 Numer of Observations Used 147.000 dtype: float64 ---------------------------------------- Running ADF on ER11-12 Test Statistic -10.022 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on SFR11-12 Test Statistic -15.391 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFRER4 Test Statistic -13.316 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFRER6 Test Statistic -13.083 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on SFRER8 Test Statistic -8.515 p-value 0.000 #Lags Used 3.000 Numer of Observations Used 151.000 dtype: float64 ---------------------------------------- Running ADF on SFRER10 Test Statistic -8.183 p-value 0.000 #Lags Used 3.000 Numer of Observations Used 151.000 dtype: float64 ---------------------------------------- Running ADF on SFRER12 Test Statistic -13.338 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on US302 Test Statistic -15.346 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on US305 Test Statistic -13.64 p-value 0.00 #Lags Used 0.00 Numer of Observations Used 154.00 dtype: float64 ---------------------------------------- Running ADF on US3010 Test Statistic -12.648 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on US102 Test Statistic -8.204 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on US105 Test Statistic -14.196 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on US52 Test Statistic -6.032 p-value 0.000 #Lags Used 2.000 Numer of Observations Used 152.000 dtype: float64 ---------------------------------------- Running ADF on DE302 Test Statistic -10.914 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on DE305 Test Statistic -10.856 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on DE3010 Test Statistic -10.591 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on DE102 Test Statistic -4.594 p-value 0.000 #Lags Used 5.000 Numer of Observations Used 149.000 dtype: float64 ---------------------------------------- Running ADF on DE105 Test Statistic -12.363 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on DE52 Test Statistic -8.98 p-value 0.00 #Lags Used 2.00 Numer of Observations Used 152.00 dtype: float64 ---------------------------------------- Running ADF on CA102 Test Statistic -10.887 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on CA105 Test Statistic -14.772 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on CA52 Test Statistic -13.536 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on ITDE10 Test Statistic -13.033 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on USDE2 Test Statistic -12.542 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on USDE5 Test Statistic -4.988 p-value 0.000 #Lags Used 8.000 Numer of Observations Used 146.000 dtype: float64 ---------------------------------------- Running ADF on USDE10 Test Statistic -7.119 p-value 0.000 #Lags Used 4.000 Numer of Observations Used 150.000 dtype: float64 ---------------------------------------- Running ADF on USEZINF2 Test Statistic -13.366 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on USEZINF5 Test Statistic -15.032 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on USEZINF10 Test Statistic -17.683 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on USINF102 Test Statistic -11.303 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on USINF105 Test Statistic -4.939 p-value 0.000 #Lags Used 7.000 Numer of Observations Used 147.000 dtype: float64 ---------------------------------------- Running ADF on USINF52 Test Statistic -10.194 p-value 0.000 #Lags Used 0.000 Numer of Observations Used 154.000 dtype: float64 ---------------------------------------- Running ADF on EZINF102 Test Statistic -10.929 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ---------------------------------------- Running ADF on EZINF105 Test Statistic -4.881 p-value 0.000 #Lags Used 6.000 Numer of Observations Used 148.000 dtype: float64 ---------------------------------------- Running ADF on EZINF52 Test Statistic -12.958 p-value 0.000 #Lags Used 1.000 Numer of Observations Used 153.000 dtype: float64 ----------------------------------------
Model Selection & Backtest¶
Test various classification models (i.e., SVC, Logit, Random Forest) to see which model best predicts an up or down move in a position in a walk-forward analysis.
DV01 = 10000
aum = 1000000
look_back = 3
beg = 0
end = beg+(52*look_back)
# capture positions for each trade to be used in portfolio construction later
position_dict = dict()
def get_features (df, beg, end, target):
'''
Function takes a pandas dataframe and calculates a correlation matrix
Using the absolute value of the correlation coefficient, the matrix is sorted
and filtered to only show correlations above 0.7 up to maximum of 10 relationships.
Returns the names of the securities with high correlations.
'''
_corr = df.iloc[beg:end].diff().corr(method='pearson')[target].abs()
_feats = _corr.abs().sort_values(by=target, ascending=False).where(lambda x: x > 0.7).dropna().index.tolist()[1:11]
return _feats
def lag_feature(df, features, target):
'''
Function takes a pandas dataframe and inserts lags of all columns.
Returns a dataframe with all lags inserted, first differenced,
and a target series as a categorical variable.
'''
df = df[features + target].ffill().copy()
tmp_dict = dict()
for col in df.columns:
for l in range(1,5):
tmp_col = df[col].shift(l).copy()
tmp_dict[col+'L'+str(l)] = tmp_col
df = pd.concat([df,
pd.DataFrame.from_dict(tmp_dict)],
axis=1
)
df[target] = df[target].shift(-1)
df.dropna(inplace=True)
X = df.diff()[1:].ffill().drop(target, axis=1)
y = df[target].diff()[1:].ffill()
upper = 0.03 # +0.03% or up 3 basis points
lower = -0.03 # -0.03% or down 3 basis points
y = pd.DataFrame(np.where(y > upper, 1, np.where(y < lower, -1, 0)))
return X, y
def filter(X, y, beg, end):
'''
Function takes the target and features, as well as
beginning and ending index values as integers.
Returns target and features over selected time period.
'''
X = X.iloc[beg : end].copy()
y = pd.DataFrame(y).iloc[beg : end].values
return X, y
def train_logit(X, y):
'''
Function takes a feature dataframe and target series,
and returns a logistic regression model.
'''
log = LogisticRegression(solver='saga', multi_class='multinomial')
log.fit(X, np.reshape(y,-1))
return log
def valid_logit(X, y, model):
'''
Function takes a feature dataframe, target series, and logistic regression
model, and returns the model score.
'''
y_pred = model.predict(X)
score = model.score(X, y)
return score
def train_valid_rfc(train_X, train_y, valid_X, valid_y):
'''
Function takes a feature dataframe and target series for both the training
period and validation period.
A Random Forest Classifier model is trained with some hyperparameter tuning.
The best model is chosen with the highest F1 score using the validation data.
The function returns the best model, score, and number of estimators.
'''
best_n = 0
best_score = 0
for i in range(100, 300, 50):
rfc = RandomForestClassifier(random_state=51, n_estimators=i)
rfc.fit(train_X, np.reshape(train_y,-1))
y_pred = rfc.predict(valid_X)
tmpf1 = metrics.f1_score(valid_y, y_pred, average='weighted')
if tmpf1 > best_score:
best_score = tmpf1
best_n = i
best_model = rfc
return best_model, best_score, best_n
def train_valid_svc(train_X, train_y, valid_X, valid_y):
'''
Function takes a feature dataframe and target series for both the training
period and validation period.
A Support Vector Classification model is trained with some hyperparameter
tuning. The best model is chosen with the highest F1 score using the
validation data.
The function returns the best model, score, and regularization parameter.
'''
best_n = 0
best_score = 0
for i in range(1, 1000, 50):
svc_model = SVC(random_state=51, C=i, kernel='rbf', gamma='auto')
svc_model.fit(train_X, np.reshape(train_y,-1))
y_pred = svc_model.predict(valid_X)
tmpf1 = metrics.f1_score(valid_y, y_pred, average='weighted')
if tmpf1 > best_score:
best_score = tmpf1
best_n = i
best_model = svc_model
return best_model, best_score, best_n
def get_bt_stats(_dict, target, DV01, aum):
'''
Show backtest results
'''
positions = pd.DataFrame.from_dict(_dict, orient='index')
positions.columns = ['Position']
positions = positions.join(full_data_wf[target].diff().loc[positions.index], how='left')
positions['Position'] = positions['Position'].shift(1).fillna(0)
positions['cum_position'] = positions['Position']
for r in range(1, positions.shape[0]):
if positions['cum_position'].iloc[r-1] == 1 and positions['Position'].iloc[r] == 0:
positions['cum_position'].iloc[r] = 1
if positions['cum_position'].iloc[r-1] == -1 and positions['Position'].iloc[r] == 0:
positions['cum_position'].iloc[r] = -1
positions['trade'] = positions[['cum_position']] * positions[target].values
positions['pnl'] = positions['trade'] * 100 * DV01
positions['pnl'].iloc[0] = aum
positions['cum_pnl'] = positions['pnl'].cumsum()
positions['return'] = positions['cum_pnl'].pct_change()
positions['neg_return'] = positions['return'].where(positions['return'] < 0, np.nan)
positions['cum_return'] = np.cumprod(1 + positions['return'].fillna(0).values)
positions['rolling_max'] = positions['cum_pnl'].cummax()
positions['weekly_drawdown'] = positions['cum_pnl']/positions['rolling_max'] - 1.0
positions['max_drawdown'] = positions['weekly_drawdown'].cummin()
for r in range(2, positions.shape[0]):
if round(positions['cum_return'].iloc[r-1],2) <= 0.0:
positions['cum_return'].iloc[r-1] = 0.
positions['cum_return'].iloc[r] = 0.
else:
continue
# show stats
_m = positions['return'].mean() * 100 * 52
_s = positions['return'].std() * 100 * np.sqrt(52)
_negs = positions['neg_return'].std() * 100 * np.sqrt(52)
print('-----------------------------------------------------------------')
print('Backtest Results for '+target[0])
print('Annualized Mean Return %:', round(_m,2))
print('Annualized St Deviation %:', round(_s,2))
print('Sharpe Ratio assuming rf = 0:', round(_m/_s,2))
print('Max Drawdown %:', round(positions['max_drawdown'].min()*100,2))
print('Cumulative Return %:', round(positions['cum_return'].iloc[-1]*100,2))
print('Sortino Ratio assuming rf = 0:', round(_m/_negs, 2))
print('Calmar Ratio', round(_m/(-positions['max_drawdown'].min()*100),2))
print('-----------------------------------------------------------------')
# show cumulative return
plt.figure(figsize=(8,4))
plt.plot(positions['cum_return'])
#plt.plot(positions['cum_pnl']/positions['cum_pnl'][0])
plt.title('Cumulative Return Multiple on US $1,000,000 for '+target[0])
plt.xlabel('Date')
plt.grid(True)
plt.show()
return positions
Target 1: Euribor 6th contract - 12th contract (3Y-1.5Y Yield Curve)¶
target_1 = ['ER6-12']
target_1_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_1
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_1_features,
target_1
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end
)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52
)
score = valid_logit(valid_X,
valid_y,
log_model
)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.31 Best F1 score for RFC: 0.33 Best number of estimators: 250 Best F1 score for SVC: 0.26 Best C for SVC: 651
# Uncomment to run code to generate tree splits
# an example of how Random Forest splits the data
#for i in range(1):
# rf_tree = rfc_model.estimators_[i]
# rf_data = export_graphviz(rf_tree,
# feature_names=valid_X.columns,
# filled=True,
# max_depth=2,
# impurity=False,
# proportion=True)
# rf_graph = graphviz.Source(rf_data)
# display(rf_graph)
Walk-forward analysis¶
target_1_wf_dict = dict() # dictionary to save scores
target_1_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_1_features,
target_1
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_1_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_1
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_1_features,
target_1
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_1_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1))
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_1_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_1[0]] = get_bt_stats(target_1_position, target_1, DV01, aum)
----------------------------------------------------------------- Backtest Results for ER6-12 Annualized Mean Return %: 15.85 Annualized St Deviation %: 20.64 Sharpe Ratio assuming rf = 0: 0.77 Max Drawdown %: -44.59 Cumulative Return %: 339.0 Sortino Ratio assuming rf = 0: 1.09 Calmar Ratio 0.36 -----------------------------------------------------------------
Target 2:Canada 10Y-2Y Yield Curve¶
target_2 = ['CA102']
target_2_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_2
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_2_features,
target_2
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.4 Best F1 score for RFC: 0.37 Best number of estimators: 250 Best F1 score for SVC: 0.47 Best C for SVC: 651
Walk-forward analysis¶
target_2_wf_dict = dict() # dictionary to save scores
target_2_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_2_features,
target_2
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_2_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_2
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_2_features,
target_2
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_2_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1))
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_2_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_2[0]] = get_bt_stats(target_2_position, target_2, DV01, aum)
----------------------------------------------------------------- Backtest Results for CA102 Annualized Mean Return %: 16.37 Annualized St Deviation %: 35.79 Sharpe Ratio assuming rf = 0: 0.46 Max Drawdown %: -72.37 Cumulative Return %: 241.6 Sortino Ratio assuming rf = 0: 0.61 Calmar Ratio 0.23 -----------------------------------------------------------------
Target 3: US 5Y-2Y Yield Curve¶
target_3 = ['US52']
target_3_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_3
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_3_features,
target_3
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.29 Best F1 score for RFC: 0.35 Best number of estimators: 150 Best F1 score for SVC: 0.32 Best C for SVC: 251
Walk-forward analysis¶
target_3_wf_dict = dict() # dictionary to save scores
target_3_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_3_features,
target_3
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_3_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_3
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_3_features,
target_3
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_3_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
)
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_3_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_3[0]] = get_bt_stats(target_3_position, target_3, DV01, aum)
----------------------------------------------------------------- Backtest Results for US52 Annualized Mean Return %: 11.97 Annualized St Deviation %: 20.92 Sharpe Ratio assuming rf = 0: 0.57 Max Drawdown %: -37.48 Cumulative Return %: 238.96 Sortino Ratio assuming rf = 0: 0.86 Calmar Ratio 0.32 -----------------------------------------------------------------
Target 4: US 30Y-2Y Yield Curve¶
target_4 = ['US302']
target_4_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_4
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_4_features,
target_4
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.38 Best F1 score for RFC: 0.4 Best number of estimators: 250 Best F1 score for SVC: 0.35 Best C for SVC: 451
Walk-forward analysis¶
target_4_wf_dict = dict() # dictionary to save scores
target_4_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_4_features,
target_4
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_4_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_4
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_4_features,
target_4
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_4_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
)
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_4_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_4[0]] = get_bt_stats(target_4_position, target_4, DV01, aum)
----------------------------------------------------------------- Backtest Results for US302 Annualized Mean Return %: 20.07 Annualized St Deviation %: 24.2 Sharpe Ratio assuming rf = 0: 0.83 Max Drawdown %: -38.5 Cumulative Return %: 460.03 Sortino Ratio assuming rf = 0: 1.43 Calmar Ratio 0.52 -----------------------------------------------------------------
Target 5: US SOFR 4th contract - 8th contract (2Y-1Y Yield Curve)¶
target_5 = ['SFR4-8']
target_5_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_5
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_5_features,
target_5
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.42 Best F1 score for RFC: 0.42 Best number of estimators: 100 Best F1 score for SVC: 0.46 Best C for SVC: 151
Walk-forward analysis¶
target_5_wf_dict = dict() # dictionary to save scores
target_5_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_5_features,
target_5
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_5_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_5
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_5_features,
target_5
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_5_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1))
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_5_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_5[0]] = get_bt_stats(target_5_position, target_5, DV01, aum)
----------------------------------------------------------------- Backtest Results for SFR4-8 Annualized Mean Return %: 19.98 Annualized St Deviation %: 38.94 Sharpe Ratio assuming rf = 0: 0.51 Max Drawdown %: -57.86 Cumulative Return %: 304.0 Sortino Ratio assuming rf = 0: 0.67 Calmar Ratio 0.35 -----------------------------------------------------------------
Target 6: US 10Y-2Y Yield Curve¶
target_6 = ['US102']
target_6_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_6
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_6_features,
target_6
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.35 Best F1 score for RFC: 0.2 Best number of estimators: 250 Best F1 score for SVC: 0.34 Best C for SVC: 751
Walk-forward analysis¶
target_6_wf_dict = dict() # dictionary to save scores
target_6_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_6_features,
target_6
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_6_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_6
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_6_features,
target_6
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_6_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
)
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_6_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_6[0]] = get_bt_stats(target_6_position, target_6, DV01, aum)
----------------------------------------------------------------- Backtest Results for US102 Annualized Mean Return %: 18.83 Annualized St Deviation %: 16.33 Sharpe Ratio assuming rf = 0: 1.15 Max Drawdown %: -24.88 Cumulative Return %: 474.32 Sortino Ratio assuming rf = 0: 2.09 Calmar Ratio 0.76 -----------------------------------------------------------------
Target 7: German 10Y-2Y Yield Curve¶
target_7 = ['DE102']
target_7_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_7
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_7_features,
target_7
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.37 Best F1 score for RFC: 0.29 Best number of estimators: 100 Best F1 score for SVC: 0.34 Best C for SVC: 301
Walk-forward analysis¶
target_7_wf_dict = dict() # dictionary to save scores
target_7_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_7_features,
target_7
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_7_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_7
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_7_features,
target_7
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_7_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
)
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_7_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_7[0]] = get_bt_stats(target_7_position, target_7, DV01, aum)
----------------------------------------------------------------- Backtest Results for DE102 Annualized Mean Return %: 19.16 Annualized St Deviation %: 16.98 Sharpe Ratio assuming rf = 0: 1.13 Max Drawdown %: -24.96 Cumulative Return %: 483.8 Sortino Ratio assuming rf = 0: 1.76 Calmar Ratio 0.77 -----------------------------------------------------------------
Target 8: German 5Y-2Y Yield Curve¶
target_8 = ['DE52']
target_8_features = get_features(full_data_wf.loc[:'2023'],
beg, end, target_8
)
# Get data
_X, _y = lag_feature(full_data_wf.loc[:'2023'],
target_8_features,
target_8
)
# Run Logit Model
X_, y_ = filter(_X, _y, beg, end)
log_model = train_logit(X_, y_)
valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
Logit model score: 0.37 Best F1 score for RFC: 0.26 Best number of estimators: 150 Best F1 score for SVC: 0.35 Best C for SVC: 1
Walk-forward analysis¶
target_8_wf_dict = dict() # dictionary to save scores
target_8_position = dict() # dictionary to save positions
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_8_features,
target_8
)
for idx in range(0, X.shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_8_features = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_8
)
# Get data
X, y = lag_feature(full_data_wf.loc[:'2023'],
target_8_features,
target_8
)
# Run Logit Model
_X, _y = filter(X, y, beg, end)
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = filter(X, y, _end, _end+52)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_8_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = filter(X, y, _end+52, _end+(52+1)
)
_log_pred = _log_model.predict(_test_X)
_rfc_pred = _rfc_model.predict(_test_X)
_svc_pred = _svc_model.predict(_test_X)
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_8_position[_test_X.index[0]] = _mode[0]
Backtest results¶
position_dict[target_8[0]] = get_bt_stats(target_8_position, target_8, DV01, aum)
----------------------------------------------------------------- Backtest Results for DE52 Annualized Mean Return %: 15.16 Annualized St Deviation %: 30.91 Sharpe Ratio assuming rf = 0: 0.49 Max Drawdown %: -45.62 Cumulative Return %: 253.3 Sortino Ratio assuming rf = 0: 0.72 Calmar Ratio 0.33 -----------------------------------------------------------------
Target 9: Italy 10Y - German 10Y¶
def make_pca(df, beg, end):
cds = df.ffill().iloc[beg : end].diff().dropna().values
scaler = StandardScaler()
scaled_cds = scaler.fit_transform(cds)
pca = PCA(n_components=3)
pc = pca.fit_transform(scaled_cds)
# multiply PCA loadings with scaled underlying data
scaled_cds_array = np.array(scaled_cds)
scaled_loadings = np.dot(scaled_cds_array, pca.components_.T)
# multiply "scaled loadings" with principal component weights, respectively
# to create one index that combines all three principal components
# this index had to be negated to make it comparable with underlying CDS data
EZ_risk_index = -np.dot(scaled_loadings, pca.explained_variance_)
EZ_risk = pd.DataFrame(EZ_risk_index.cumsum(),
index=df.iloc[beg : end].index[1:]
)
EZ_risk.columns = ['EZ Risk Index']
return EZ_risk/100
def lag_feature_pca(df, features_df, features2, target, beg, end, _beg):
'''
Function takes a pandas dataframe and inserts lags of all columns.
Returns a dataframe with all lags inserted, first differenced,
and a target series as a categorical variable.
'''
df_ = df[features2 + target].iloc[beg : end].ffill().copy()
pca_df = make_pca(features_df, beg, end)
df_ = pd.concat([df_, pca_df], axis=1)
tmp_dict = dict()
for col in df_.columns:
for l in range(1,5):
tmp_col = df_[col].shift(l).copy()
tmp_dict[col+'L'+str(l)] = tmp_col
df_ = pd.concat([df_,
pd.DataFrame.from_dict(tmp_dict)],
axis=1
)
df_[target] = df_[target].shift(-1)
df_.dropna(inplace=True)
X = df_.diff()[1:].ffill().iloc[_beg:].drop(target, axis=1)
y = df_[target].diff()[1:].ffill().iloc[_beg:]
upper = 0.03
lower = -0.03
y = pd.DataFrame(np.where(y > upper, 1, np.where(y < lower, -1, 0)))
return X, y
target_9 = ['ITDE10']
target_9_features = file_dict['ez_cds'].resample('W-FRI').last().copy()
target_9_features2 = get_features(full_data_wf.loc[:'2023'],
beg, end, target_9
)
# Get data
X_, y_ = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features,target_9_features2,
target_9, beg, end, 0
)
# Run Logit Model
log_model = train_logit(X_, y_)
valid_X, valid_y = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features,target_9_features2,
target_9, beg, end+52, end
)
#valid_X, valid_y = filter(_X, _y, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
#print(_X)
#print(_y)
Logit model score: 0.33
target_9 = ['ITDE10']
target_9_features = file_dict['ez_cds'].resample('W-FRI').last().copy()
#target_9_features2 = feature_dict[target_9[0]]
target_9_features2 = get_features(full_data_wf.loc[:'2023'],
beg, end, target_9
)
# Note that this process is slightly different given the PCA feature
# Get data
X_, y_ = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features,target_9_features2,
target_9, beg, end, 0
)
# Run Logit Model
log_model = train_logit(X_, y_)
valid_X, valid_y = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features,target_9_features2,
target_9, beg, end+52, -52
)
#valid_X, valid_y = filter(X_, y_, end, end+52)
score = valid_logit(valid_X, valid_y, log_model)
print('Logit model score:', round(score,2))
# Run Random Forest Classification Model
rfc_model, rfc_score, rfc_n = train_valid_rfc(X_, y_, valid_X, valid_y)
print('Best F1 score for RFC:', round(rfc_score,2))
print('Best number of estimators:', rfc_n)
# Run SVC Model
svc_model, svc_score, svc_n = train_valid_svc(X_, y_, valid_X, valid_y)
print('Best F1 score for SVC:', round(svc_score,2))
print('Best C for SVC:', svc_n)
print()
Logit model score: 0.33 Best F1 score for RFC: 0.32 Best number of estimators: 100 Best F1 score for SVC: 0.47 Best C for SVC: 351
Walk-forward analysis¶
target_9_wf_dict = dict() # dictionary to save scores
target_9_position = dict() # dictionary to save positions
# Get data
_X, _y = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features,target_9_features2,
target_9, 0, 0+(52*look_back), 0
)
for idx in range(0, full_data_wf.loc[:'2023'].shape[0] - (52*(look_back+2))):
_beg = idx
_end = idx + (52*look_back)
if idx % 13 == 0: # refresh models every quarter
# Refresh features via updated correlations
target_9_features2 = get_features(full_data_wf.loc[:'2023'],
_beg, _end, target_9
)
if 'EZ Risk Index' in target_9_features2:
target_9_features2.remove('EZ Risk Index')
# Get data
_X, _y = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features, target_9_features2,
target_9, _beg, _end, 0
)
# Run Logit Model
_log_model = train_logit(_X, _y)
_valid_X, _valid_y = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features,target_9_features2,
target_9, _beg, _end+52, -52
)
_score = valid_logit(_valid_X, _valid_y, _log_model)
# Run Random Forest Classification Model
_rfc_model, _rfc_score, _rfc_n = train_valid_rfc(_X, _y, _valid_X, _valid_y)
# Run SVC Model
_svc_model, _svc_score, _svc_n = train_valid_svc(_X, _y, _valid_X, _valid_y)
# Save the scores indexed by ending year of training period
target_9_wf_dict[X.index[beg]] = [round(_score,2), round(_rfc_score,2), round(_svc_score,2)]
# Forecast
_test_X, _test_y = lag_feature_pca(full_data_wf.loc[:'2023'],
target_9_features,target_9_features2,
target_9, _beg, _end+52+1, -5
)
_log_pred = _log_model.predict(_test_X.iloc[[-1]])
_rfc_pred = _rfc_model.predict(_test_X.iloc[[-1]])
_svc_pred = _svc_model.predict(_test_X.iloc[[-1]])
_mode = multimode([_log_pred[0], _rfc_pred[0], _svc_pred[0]])
if len(_mode) > 1:
_mode = [0]
target_9_position[_test_X.iloc[[-1]].index[0]] = _mode[0]
Backtest results¶
position_dict[target_9[0]] = get_bt_stats(target_9_position, target_9, DV01, aum)
----------------------------------------------------------------- Backtest Results for ITDE10 Annualized Mean Return %: 679.35 Annualized St Deviation %: 2157.53 Sharpe Ratio assuming rf = 0: 0.31 Max Drawdown %: -347.3 Cumulative Return %: 0.0 Sortino Ratio assuming rf = 0: 0.47 Calmar Ratio 1.96 -----------------------------------------------------------------
Portfolio Construction¶
Equal Weight¶
strat_returns = pd.DataFrame()
for k in position_dict.keys():
if k == 'ITDE10': # skip this strategy because backtest results were poor
continue
_ = pd.DataFrame.from_dict(position_dict[k][['return']])
_.columns = [k]
strat_returns = pd.concat([strat_returns, _], axis=1)
# create an equal-weighted portfolio
eq_w_pf = pd.DataFrame(strat_returns.mean(axis=1), columns=['return']).fillna(0)
# calculate performance statistics
eq_w_pf['neg_return'] = eq_w_pf['return'].where(eq_w_pf['return'] < 0, np.nan)
eq_w_pf['cum_return'] = np.cumprod(1 + eq_w_pf['return'].fillna(0).values)
eq_w_pf['rolling_max'] = eq_w_pf['cum_return'].cummax()
eq_w_pf['weekly_drawdown'] = eq_w_pf['cum_return']/eq_w_pf['rolling_max'] - 1.0
eq_w_pf['max_drawdown'] = eq_w_pf['weekly_drawdown'].cummin()
# repeat 0 for return if portfolio wipes out
for r in range(2, eq_w_pf.shape[0]):
if round(eq_w_pf['cum_return'].iloc[r-1],2) <= 0.0:
eq_w_pf['cum_return'].iloc[r-1] = 0.
eq_w_pf['cum_return'].iloc[r] = 0.
else:
continue
eq_w_pf['cum_return'].plot(title='Equal Weighted Portfolio - Cumulative Return')
plt.show()
# show stats
eq_m = eq_w_pf['return'].mean() * 100 * 52
eq_s = eq_w_pf['return'].std() * 100 * np.sqrt(52)
eq_negs = eq_w_pf['neg_return'].std() * 100 * np.sqrt(52)
print('-----------------------------------------------------------------')
print('Backtest Results for Equal Weighted Portfolio')
print('Annualized Mean Return %:', round(eq_m,2))
print('Annualized St Deviation %:', round(eq_s,2))
print('Sharpe Ratio assuming rf = 0:', round(eq_m/eq_s,2))
print('Max Drawdown %:', round(eq_w_pf['max_drawdown'].min()*100,2))
print('Cumulative Return %:', round(eq_w_pf['cum_return'].iloc[-1]*100,2))
print('Sortino Ratio assuming rf = 0:', round(eq_m/eq_negs, 2))
print('Calmar Ratio', round(eq_m/(-eq_w_pf['max_drawdown'].min()*100),2))
print('-----------------------------------------------------------------')
----------------------------------------------------------------- Backtest Results for Equal Weighted Portfolio Annualized Mean Return %: 17.14 Annualized St Deviation %: 14.59 Sharpe Ratio assuming rf = 0: 1.17 Max Drawdown %: -23.6 Cumulative Return %: 418.83 Sortino Ratio assuming rf = 0: 1.72 Calmar Ratio 0.73 -----------------------------------------------------------------
HRP¶
The HRP process is updated on an annual basis, i.e., annual portfolio optimization of weights. The input is the prior year's worth of weekly returns for the eight final spreads assuming a portfolio that was long all of the spreads.
%%capture
# line above is to silence some warning messages from Pandas
hrp_wts = pd.DataFrame()
for year in range(2014,2023):
strategies = full_data_wf[strat_returns.columns].diff().loc[str(year-1)].fillna(0) * 100 * DV01
strategies.iloc[0] = aum
strategies_cum_pnl = strategies.cumsum()
strategies_rets = strategies_cum_pnl.pct_change()[1:]
# Hierarchical risk parity:
# Get initial weights using spread returns from prior year
# assuming a portfolio long all of the 8 spreads
hrp = HRPOpt(strategies_rets)
weights = hrp.optimize()
hrp.portfolio_performance(verbose=False) # minimize printouts
hrp_wts = pd.concat([hrp_wts,
pd.DataFrame.from_dict(weights, orient='index').T], axis=0)
# show how dendrogram changes from beginning to end
if year in [2014, 2022]:
print('Year:',year)
print('Weights:')
print(pd.DataFrame.from_dict(weights,
orient='index',
columns=['Weights']).sort_values(by='Weights',
ascending=False)
)
plotting.plot_dendrogram(hrp) # to plot dendrogram
plt.show()
hrp_wts.set_index([pd.date_range(start=strat_returns.index[0],
end='2023',periods=9)], inplace=True
)
# convert HRP weights from annual to Week at Friday frequency
hrp_wts_wf = hrp_wts.resample('W-FRI').last().ffill()
hrp_wts_wf.head()
| CA102 | DE102 | DE52 | ER6-12 | SFR4-8 | US102 | US302 | US52 | |
|---|---|---|---|---|---|---|---|---|
| 2014-02-07 | 0.195866 | 0.126547 | 0.161098 | 0.108201 | 0.089734 | 0.084339 | 0.08607 | 0.148145 |
| 2014-02-14 | 0.195866 | 0.126547 | 0.161098 | 0.108201 | 0.089734 | 0.084339 | 0.08607 | 0.148145 |
| 2014-02-21 | 0.195866 | 0.126547 | 0.161098 | 0.108201 | 0.089734 | 0.084339 | 0.08607 | 0.148145 |
| 2014-02-28 | 0.195866 | 0.126547 | 0.161098 | 0.108201 | 0.089734 | 0.084339 | 0.08607 | 0.148145 |
| 2014-03-07 | 0.195866 | 0.126547 | 0.161098 | 0.108201 | 0.089734 | 0.084339 | 0.08607 | 0.148145 |
# create HRP-optimized portfolio
hrp_pf = hrp_wts_wf[strat_returns.columns].loc[strat_returns.index].multiply(strat_returns).sum(axis=1)
hrp_pf = pd.DataFrame(hrp_pf, columns=['return'])
# calculate performance statistics
hrp_pf['neg_return'] = hrp_pf['return'].where(hrp_pf['return'] < 0, np.nan)
hrp_pf['cum_return'] = np.cumprod(1 + hrp_pf['return'].fillna(0).values)
hrp_pf['rolling_max'] = hrp_pf['cum_return'].cummax()
hrp_pf['weekly_drawdown'] = hrp_pf['cum_return']/eq_w_pf['rolling_max'] - 1.0
hrp_pf['max_drawdown'] = hrp_pf['weekly_drawdown'].cummin()
# repeat 0 for return if portfolio wipes out
for r in range(2, hrp_pf.shape[0]):
if round(hrp_pf['cum_return'].iloc[r-1],2) <= 0.0:
hrp_pf['cum_return'].iloc[r-1] = 0.
hrp_pf['cum_return'].iloc[r] = 0.
else:
continue
hrp_pf['cum_return'].plot(title='Hierarchical Risk Parity Portfolio - Cumulative Return')
plt.show()
# show stats
hrp_m = hrp_pf['return'].mean() * 100 * 52
hrp_s = hrp_pf['return'].std() * 100 * np.sqrt(52)
hrp_negs = hrp_pf['neg_return'].std() * 100 * np.sqrt(52)
print('-----------------------------------------------------------------')
print('Backtest Results for Hierarchical Risk Parity Portfolio')
print('Annualized Mean Return %:', round(hrp_m,2))
print('Annualized St Deviation %:', round(hrp_s,2))
print('Sharpe Ratio assuming rf = 0:', round(hrp_m/hrp_s,2))
print('Max Drawdown %:', round(hrp_pf['max_drawdown'].min()*100,2))
print('Cumulative Return %:', round(hrp_pf['cum_return'].iloc[-1]*100,2))
print('Sortino Ratio assuming rf = 0:', round(hrp_m/hrp_negs, 2))
print('Calmor Ratio', round(hrp_m/(-hrp_pf['max_drawdown'].min()*100),2))
print('-----------------------------------------------------------------')
----------------------------------------------------------------- Backtest Results for Hierarchical Risk Parity Portfolio Annualized Mean Return %: 18.67 Annualized St Deviation %: 17.24 Sharpe Ratio assuming rf = 0: 1.08 Max Drawdown %: -33.9 Cumulative Return %: 462.22 Sortino Ratio assuming rf = 0: 1.58 Calmor Ratio 0.55 -----------------------------------------------------------------